# Load the "Ask a Manager 2021 Survey" Google Sheet data
# (survey hosted at https://www.askamanager.org/), previously downloaded
# as a CSV into the project's data/ folder.
ask_a_manager_2021 <- read_csv(here::here("data","ask_a_manager.csv"))
# Quick overview of the raw data: column types, missingness, ranges.
skimr::skim(ask_a_manager_2021)
| Name | ask_a_manager_2021 |
| Number of rows | 26765 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 14 |
| logical | 2 |
| numeric | 1 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| how_old_are_you | 0 | 1.00 | 5 | 10 | 0 | 7 | 0 |
| industry | 62 | 1.00 | 2 | 171 | 0 | 1084 | 0 |
| job_title | 0 | 1.00 | 1 | 126 | 0 | 12838 | 0 |
| additional_context_on_job_title | 19868 | 0.26 | 1 | 781 | 0 | 6612 | 0 |
| currency | 0 | 1.00 | 3 | 7 | 0 | 11 | 0 |
| additional_context_on_income | 23851 | 0.11 | 1 | 1143 | 0 | 2839 | 0 |
| country | 0 | 1.00 | 1 | 209 | 0 | 297 | 0 |
| state | 4761 | 0.82 | 4 | 114 | 0 | 125 | 0 |
| city | 23 | 1.00 | 1 | 171 | 0 | 4070 | 0 |
| overall_years_of_professional_experience | 0 | 1.00 | 9 | 16 | 0 | 8 | 0 |
| years_of_experience_in_field | 0 | 1.00 | 9 | 16 | 0 | 8 | 0 |
| highest_level_of_education_completed | 202 | 0.99 | 3 | 34 | 0 | 6 | 0 |
| gender | 155 | 0.99 | 3 | 29 | 0 | 5 | 0 |
| race | 151 | 0.99 | 5 | 125 | 0 | 47 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| other_monetary_comp | 26765 | 0 | NaN | : |
| currency_other | 26765 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| annual_salary | 0 | 1 | 144988 | 5488158 | 0 | 54000 | 75712 | 110000 | 8.7e+08 | ▇▁▁▁▁ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| timestamp | 0 | 1 | 2021-04-27 11:02:09 | 2021-09-14 21:55:44 | 2021-04-28 12:35:21 | 23989 |
After skimming the dataset, we took the steps below to clean the raw data:
# Remove rows with missing/undisclosed gender, missing race or education,
# and non-positive salaries. The original relied on `x != "NA"` to drop
# true NA values (filter() discards rows whose condition evaluates to NA);
# that behaviour is kept but made explicit with is.na() so the intent is
# visible. The literal string "NA" is still removed as well, since the raw
# sheet contains it as text in some columns.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(
    !is.na(gender),
    !gender %in% c("NA", "Other or prefer not to answer",
                   "Prefer not to answer", "Non-binary"),
    !is.na(race), race != "NA",
    !is.na(highest_level_of_education_completed),
    highest_level_of_education_completed != "NA",
    annual_salary > 0
  )
# Alternative: z-score rule, removing data beyond 3 sigma (not used).
# ask_a_manager_2021 <- ask_a_manager_2021[-which(abs(scale(ask_a_manager_2021$annual_salary))>3),]
# Chosen method: Tukey/IQR rule -- remove data larger than Q3 + 1.5*IQR
# or smaller than Q1 - 1.5*IQR, as commonly recommended by the stats community.
Q <- quantile(ask_a_manager_2021$annual_salary, probs=c(.25, .75))
iqr <- IQR(ask_a_manager_2021$annual_salary)
up <- Q[2]+1.5*iqr # Upper fence
low <- Q[1]-1.5*iqr # Lower fence (previous comment said "Upper Range" -- typo)
# Keep only salaries inside the fences.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(annual_salary>=low & annual_salary<=up)
We observe that only 2 respondents under 18 years old entered their data correctly, so we remove them to make analysis between age groups possible.
# Plot the sample size per age group. Note: fct_relevel() takes the levels
# as bare character arguments, not a `levels =` vector -- the previous
# named argument was silently absorbed into `...`.
ask_a_manager_2021 %>%
  group_by(how_old_are_you, gender) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = fct_relevel(how_old_are_you,
                             "under 18", "18-24", "25-34", "35-44",
                             "45-54", "55-64", "65 or over"),
             y = count)) +
  geom_col() +
  labs(title = "Sample size in age groups",
       x = "Age Group",
       y = "number of samples") +
  NULL
# Only 2 under-18 respondents entered usable data; drop them so that
# comparisons between age groups remain meaningful.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(how_old_are_you != "under 18")
Next we convert the relevant columns to factor data types. Namely, we want to make age, years of working experience, and education into factors, and take the log of salary for the next step of EDA.
# Convert survey strings into typed columns for the EDA that follows:
# ordered factors for age / experience / education, a numeric higher-
# education dummy, and a log-transformed salary.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(
    # normalise free-text country names via countrycode (e.g. "USA" variants)
    country = countrycode::countryname(country),
    age_group = factor(how_old_are_you,levels=c("18-24","25-34","35-44","45-54","55-64","65 or over")),
    # NOTE(review): level strings must match the raw data exactly --
    # the mixed "2 - 4 years" vs "5-7 years" spacing mirrors the survey.
    field_exp = factor(years_of_experience_in_field,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
    pro_exp = factor(overall_years_of_professional_experience,levels=c("1 year or less","2 - 4 years","5-7 years","8 - 10 years","11 - 20 years","21 - 30 years","31 - 40 years","41 years or more")),
    education = factor(highest_level_of_education_completed, levels=c("High School","Some college","College degree","Master's degree","Professional degree (MD, JD, etc.)", "PhD")),
    # 1 if the respondent holds a college degree or higher, else 0
    higher_edu = ifelse(education %in% c("College degree","Master's degree", "PhD", "Professional degree (MD, JD, etc.)"),1,0),
    salary = annual_salary,
    # take ln() of annual salary (salary > 0 is guaranteed by earlier filtering)
    log_salary = log(annual_salary)
  ) %>%
  janitor::clean_names() # clean columns names
# Sanity check for respondents recorded as under 18.
# NOTE(review): under-18 rows were already removed above and "under 18" is
# not a level of age_group, so this is expected to return zero rows -- the
# empty table printed below confirms that.
ask_a_manager_2021 %>%
  filter(age_group=="under 18") %>%
  select(c("age_group","overall_years_of_professional_experience"))
| age_group | overall_years_of_professional_experience |
|---|
# Drop internally inconsistent rows: a respondent's maximum possible age
# (the upper bound of their age bracket) must be at least the minimum
# years of experience implied by their answers.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(max_age = case_when(how_old_are_you == "under 18" ~ 18,
                             how_old_are_you == "65 or over" ~ 999,
                             TRUE ~ as.numeric(substr(how_old_are_you, 4, 5))),
         # lower bound of the larger of the two experience brackets;
         # parse_number() extracts the first number in each answer string
         min_exp = pmax(parse_number(overall_years_of_professional_experience),
                        parse_number(years_of_experience_in_field))) %>%
  # delete the problematic entries, then drop the two helper columns
  filter(max_age >= min_exp) %>%
  select(-c("max_age", "min_exp"))
Since there are over 1000 reported industries in the dataset, we will only keep the top 25 industries which account for 92% of total entries
# Industry is messy: it has > 1000 distinct values. Keep only the names of
# the top 25 industries (24597, ~92%, of 26765 entries) and lump the rest
# into "Others".
industry_list <- ask_a_manager_2021 %>%
  count(industry, sort = TRUE) %>%
  # slice_max keeps ties, which is acceptable here; the percent share is
  # not needed downstream, so it is not computed
  slice_max(order_by = n, n = 25) %>%
  select(industry)
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(industry = ifelse(industry %in% industry_list$industry,
                           industry,
                           "Others"))
Since there are 11 different currencies reported in the dataset, we want to unify the most common currencies into USD for the convenience of comparison analysis afterwards. We decided to convert all currencies with more than 100 occurrences into USD: CAD, GBP, EUR, and AUD/NZD. "Other" was not converted, as there was no specific information on the currency.
# Tabulate reported currencies with their share of all responses.
ask_a_manager_2021 %>%
  count(currency, sort = TRUE) %>%
  mutate(percent = n / sum(n) * 100)
| currency | n | percent |
|---|---|---|
| USD | 20268 | 83.6 |
| CAD | 1474 | 6.08 |
| GBP | 1434 | 5.91 |
| EUR | 551 | 2.27 |
| AUD/NZD | 432 | 1.78 |
| Other | 57 | 0.235 |
| CHF | 29 | 0.12 |
| SEK | 1 | 0.00412 |
From the above table, We can see that USD, CAD, GBP, EUR, (AUD/NZD) are the most common currencies. We will convert the currencies all to USD using exchange rate data from library quantmod, and the exchange rate will be taken from the last date of the survey “2021-09-14”
# Fetch spot FX rates for the needed currency pairs via quantmod::getSymbols.
# Oanda keeps only ~180 days of history, so 180 days after "2021-09-14"
# this call will no longer return the currency rates.
currencies <-getSymbols(c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
                        from="2021-09-14", to="2021-09-14", src="oanda")
# Hard-coded snapshot of the same rates (USD per 1 unit of the first-named
# currency) so the analysis stays reproducible after the 180-day window.
currencies_preserved <- tibble(
  name = c("EUR/USD","CAD/USD","GBP/USD","AUD/USD" , "NZD/USD"),
  value = c(1.18,0.79,1.38,0.734,0.711)
)
Here we create a new column called converted_salary which converts the 5 most common currencies in USD. These are then rounded to the nearest whole number.
# Convert the five most common currencies into USD, rounded to the nearest
# whole number. Rates come from currencies_preserved$value so the analysis
# still runs after getSymbols() stops returning data; the index matches the
# currency pair in currencies_preserved$name.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  mutate(converted_salary = case_when(
    currency == "GBP" ~ round(annual_salary*currencies_preserved$value[3]),
    currency == "AUD/NZD" & country == "Australia" ~ round(annual_salary*currencies_preserved$value[4]),
    currency == "AUD/NZD" & country == "New Zealand" ~ round(annual_salary*currencies_preserved$value[5]),
    currency == "CAD" ~ round(annual_salary*currencies_preserved$value[2]),
    currency == "EUR" ~ round(annual_salary*currencies_preserved$value[1]),
    # NOTE(review): AUD/NZD respondents from any country other than
    # Australia or New Zealand fall through to this branch and keep their
    # raw salary unconverted -- confirm this is intended.
    TRUE ~ round(annual_salary)
  ))
# Keep only rows whose salary was actually converted into USD terms; this
# also discards the outlier-heavy minor currencies and rows with no country.
converted_currencies <- c("USD", "GBP", "AUD/NZD", "EUR", "CAD")
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(currency %in% converted_currencies, !is.na(country))
# Re-tabulate the currency mix after filtering.
ask_a_manager_2021 %>%
  count(currency, sort = TRUE) %>%
  mutate(percent = n / sum(n) * 100)
| currency | n | percent |
|---|---|---|
| USD | 20128 | 84.1 |
| CAD | 1464 | 6.12 |
| GBP | 1362 | 5.69 |
| EUR | 547 | 2.29 |
| AUD/NZD | 428 | 1.79 |
# Some quick counts, groups, etc
# Age distribution. age_group is already a factor with correctly ordered
# levels (set during cleaning), so it can be mapped directly; the previous
# fct_relevel(..., levels = c(...)) call passed an unsupported named
# argument that was silently absorbed by `...`.
ask_a_manager_2021 %>%
  count(age_group, sort=TRUE) %>%
  mutate(percent = 100* n/sum(n)) %>%
  ggplot(aes(x = age_group, y = n)) +
  geom_col() +
  labs(title = "Age distribution of Ask a Manager survey respondents",
       x="Age group",
       y="Frequency",
       caption = "Source:Askamanager.com")
# Helper: frequency table for one column with its percentage share,
# sorted by descending frequency. Replaces seven copy-pasted
# count()/mutate()/arrange() chains that all did the same thing.
count_with_pct <- function(df, var) {
  df %>%
    count({{ var }}, sort = TRUE) %>%
    mutate(percent = 100 * n / sum(n))
}
# 'country'
ask_a_manager_2021 %>% count_with_pct(country)
# 'city'
ask_a_manager_2021 %>% count_with_pct(city)
# education
ask_a_manager_2021 %>% count_with_pct(education)
# gender
ask_a_manager_2021 %>% count_with_pct(gender)
# race
ask_a_manager_2021 %>% count_with_pct(race)
# pro_exp
ask_a_manager_2021 %>% count_with_pct(pro_exp)
# field_exp
ask_a_manager_2021 %>% count_with_pct(field_exp)
#make comments about the counts
How is salary distributed?
favstats(ask_a_manager_2021$salary) # summary statistics; note the extreme maximum (one very rich respondent)
| min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|
| 1 | 5.4e+04 | 7.5e+04 | 1.04e+05 | 1.92e+05 | 8.18e+04 | 3.75e+04 | 23929 | 0 |
# density of raw salary
g1 <- ggplot(ask_a_manager_2021, aes(x=salary))+
  geom_density()+
  NULL
# empirical CDF of raw salary; drop the 5 largest salaries so the x-axis
# is not dominated by extreme values
g2 <- ask_a_manager_2021 %>%
  slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # keep all but the top 5
  ggplot(aes(x=salary))+
  stat_ecdf()+
  NULL
# on a log scale the distribution is much closer to symmetric
g3 <- ggplot(ask_a_manager_2021, aes(x=log(salary)))+
  geom_density()+
  NULL
# empirical CDF of log salary (same top-5 trimming as g2)
g4 <- ask_a_manager_2021 %>%
  slice_min(order_by = salary,n =nrow(ask_a_manager_2021)-5) %>% # keep all but the top 5
  ggplot(aes(x=log(salary)))+
  stat_ecdf()+
  NULL
# 2x2 grid: raw vs log, density vs ECDF
gridExtra::grid.arrange(g1,g2,g3,g4)
Observation: This is a right skewed distribution.
# Horizontal bar chart of respondents per industry, largest on top.
ask_a_manager_2021 %>%
  count(industry) %>%
  ggplot(aes(x = n, y = fct_reorder(industry, n))) +
  geom_col()
Observations:
We are interested in the type of industries with top salary for man and woman.
# Industries of top average salary for men and women, with the share of
# higher-educated respondents mapped to the fill colour.
salary_table <- ask_a_manager_2021 %>%
  group_by(industry, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary),
            higher_edu_prop = sum(higher_edu)/n()) %>%
  # id_cols must not overlap names_from/values_from (the previous 1:5
  # selection included both); industry is the only identifier column.
  pivot_wider(id_cols = industry,
              names_from = gender,
              values_from = count:higher_edu_prop)
g_man <- salary_table %>%
  ggplot(aes(y=fct_reorder(industry,avg_salary_Man), x=avg_salary_Man,fill=higher_edu_prop_Man))+
  geom_bar(position="dodge", stat="identity")+
  geom_text(aes(label=paste(avg_salary_Man%/%1000,"k")),
            position=position_dodge(width=0.9), hjust=0,vjust=0)+
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low="sky blue", high="blue")+
  labs(
    title="Man",
    x="Annual Salary $",
    y="Industry",
    fill="Higher Education %"
  )+
  theme_wsj()+
  theme(legend.position="bottom",
        legend.title=element_text(size=13))+
  NULL
# ggsave() saves the *last displayed* plot by default; g_man has not been
# printed at this point, so pass it explicitly.
ggsave("highest_paid_industries_male.png", plot = g_man,
       width = 12, height = 8, limitsize = FALSE)
g_woman <- salary_table %>%
  ggplot(aes(y=fct_reorder(industry,avg_salary_Woman), x=avg_salary_Woman,fill=higher_edu_prop_Woman))+
  geom_bar( position="dodge", stat="identity")+
  geom_text(aes(label=paste(avg_salary_Woman%/%1000,"k")),
            position=position_dodge(width=0.9), hjust=0,vjust=0)+
  scale_x_continuous(labels = scales::comma) +
  scale_fill_gradient(low="pink", high="red")+
  labs(
    title="Woman",
    x="Annual Salary $",
    y="Industry",
    fill="Higher Education %"
  )+
  theme_wsj()+
  theme(legend.position="bottom",
        legend.title=element_text(size=13))+
  NULL
ggsave("highest_paid_industries_female.png", plot = g_woman,
       width = 12, height = 8, limitsize = FALSE)
g_man
g_woman
Observations:
We are also interested in the salary difference between man and woman within the same industry.
# Salary gap (men - women) within each industry; fill encodes the male
# share of respondents in that industry.
g_gap <- ask_a_manager_2021 %>%
  group_by(industry, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>%
  # industry is the only identifier column; id_cols must not overlap
  # names_from/values_from (the previous 1:4 selection did).
  pivot_wider(id_cols = industry,
              names_from = gender,
              values_from = count:avg_salary) %>%
  mutate(size = count_Man+count_Woman,
         male_prop = count_Man/(count_Man+count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>%
  ggplot(aes(y=fct_reorder(industry,salary_gap), x=salary_gap,fill=male_prop))+
  geom_col()+
  # geom_point(aes(y=fct_reorder(industry,size),x=size*10))+
  scale_x_continuous(
    breaks = seq(-20000,40000,5000)
  ) +
  scale_fill_gradient(low="pink", high="blue")+
  labs(
    title="Salary Gap and Sex Ratio",
    x="Salary Gap in $",
    y="",
    fill="Male %"
  )+
  theme_bw()+
  theme(legend.position="bottom",
        legend.title=element_text(size=8))+
  NULL
g_gap
# Save explicitly rather than relying on the "last displayed plot".
ggsave("salary_gap.png", plot = g_gap, width = 12, height = 8, limitsize = FALSE)
Observations:
Next, we want to see if there is a difference in the salary gap between man and woman for different education level.
# Salary gap (men - women) at each education level; fill encodes the male
# share of respondents at that level.
g_gap_edu <- ask_a_manager_2021 %>%
  group_by(education, gender) %>%
  summarise(count = n(),
            avg_salary = mean(converted_salary)) %>%
  # education is the only identifier column; id_cols must not overlap
  # names_from/values_from (the previous 1:4 selection did).
  pivot_wider(id_cols = education,
              names_from = gender,
              values_from = count:avg_salary) %>%
  mutate(size = count_Man+count_Woman,
         male_prop = count_Man/(count_Man+count_Woman),
         salary_gap = avg_salary_Man - avg_salary_Woman) %>%
  ggplot(aes(x=education, y=salary_gap,fill=male_prop))+
  geom_col()+
  scale_y_continuous(
    breaks = seq(-20000,40000,5000)
  ) +
  scale_fill_gradient(low="pink", high="sky blue")+
  theme_bw()+
  theme(legend.position="bottom", axis.title=element_text(size=8))+
  ggtitle("Salary Gap at Different Education level") +
  xlab("Education Level") + ylab ("Salary Gap in $") +
  # wrap long education labels onto multiple lines
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  NULL
g_gap_edu
# Save explicitly rather than relying on the "last displayed plot".
ggsave("salary_gap_by_edu.png", plot = g_gap_edu, width = 14, height = 8, limitsize = FALSE)
# Median salary by education level, split by gender.
# Removed two pieces of dead code: the `sex` column (created but never
# used; gender is already restricted to Man/Woman by the filter) and the
# unused mean_salary_by_edu summary column.
ask_a_manager_2021 %>%
  filter(gender %in% c("Man", "Woman")) %>%
  rename(Education = education) %>%
  group_by(Education, gender) %>%
  summarise(median_salary_by_edu = median(converted_salary)) %>%
  ggplot() +
  geom_col(aes(Education, median_salary_by_edu, fill = gender),
           position = position_dodge(width = 0.9), alpha = 0.7) +
  # wrap long education labels onto multiple lines
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL
# Respondent counts for races with more than 500 respondents.
ask_a_manager_2021 %>%
  count(race, name = "count") %>%
  filter(count > 500) %>%
  ggplot(aes(x = race, y = count, fill = race)) +
  geom_col() +
  theme_wsj() +
  theme(legend.position = "none") +
  xlab("Race") + ylab ("Count") +
  labs(title="Count by Race", subtitle = "Races with > 500 Respondents")+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  NULL
# Salary distribution by race (races with > 500 respondents), shown as
# jittered points over boxplots.
ask_a_manager_2021 %>%
  filter(industry %in% industry_list$industry) %>%
  group_by(race) %>%
  filter(n() > 500) %>%
  ggplot( aes(x=race, y=converted_salary, color=race)) +
  geom_jitter(color="black", size=0.01, alpha=0.9) +
  geom_boxplot(alpha=0.4) +
  # NOTE(review): no fill aesthetic is mapped, so this viridis fill scale
  # has no visible effect -- confirm whether it can be dropped.
  scale_fill_viridis(discrete = TRUE, alpha=0.6) +
  # NOTE(review): 144988 matches the mean salary of the *raw* data from
  # the skim above (before outlier removal) -- confirm this is the
  # intended reference line.
  geom_hline(yintercept=144988, color="red") +
  theme(
    legend.position="none",
    plot.title = element_text(size=11)
  ) +
  theme_wsj() +
  theme(legend.position = "none")+
  theme(axis.title=element_text(size=10))+
  ggtitle("Salary Vs. Race Boxplot") +
  ylab("Salary") + xlab ("Race") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  # cap the y axis at 300k; points above the cap are dropped from the plot
  scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
  NULL
Observation: In our boxplot one can visualize both the distribution of the data and the median and interquartile ranges. When comparing annual salary by race, our data shows that Asian respondents have higher salaries than Black or African American and White respondents. Interestingly, our sample also shows that the median salary for Black and African American respondents is higher than that of White respondents.
Since most respondents were White, that group's larger sample size makes it less prone to skewness from outliers when comparing it to the other two races portrayed in the plot.
# Compare salaries of Black/African American vs White respondents.
# The previous group_by(race) served no purpose for a t-test and left the
# data frame grouped; a plain filter is sufficient.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  filter(race %in% c("Black or African American", "White"))
# Welch two-sample t-test of salary between the two groups.
t.test(salary ~ race, data = ask_a_manager_2021_race)
##
## Welch Two Sample t-test
##
## data: salary by race
## t = 0.6, df = 610, p-value = 0.5
## alternative hypothesis: true difference in means between group Black or African American and group White is not equal to 0
## 95 percent confidence interval:
## -2140 4021
## sample estimates:
## mean in group Black or African American mean in group White
## 81976 81035
Observation:
When running a t-test on the two racial groups, White and Black or African American, the p-value we get is 0.5. Thus there is no statistically significant difference in their annual salary based on this data set.
# Build a per-state summary restricted to states with > 10 occurrences.
# This filters out data-entry errors such as respondents listing multiple
# states in one answer.
states <- ask_a_manager_2021 %>%
  group_by(state) %>%
  filter(!is.na(state)) %>%
  filter(n()>10) %>%
  summarise(mean_salary = mean(salary),
            # n() gives the per-state row count; the previous count(state)
            # call was a bug -- dplyr::count() is a data-frame verb and
            # errors when applied to a bare vector inside summarise()
            count_state = n())
states$fips <- fips(states$state)
statesalarymap <- states %>% select(state,mean_salary)
# Choropleth of mean salary by state.
plot_usmap(data=statesalarymap, values="mean_salary", color="black") + scale_fill_continuous(
  low = "white",
  high = "light green",
  name = "Mean Salary") +
  theme(legend.position = "right") +
  labs(title="Map of US States based on Average Salary")
# Count by Industry
ask_a_manager_2021 %>%
  count(industry, name = "count") %>%
  ggplot(aes(x = count, y = fct_reorder(industry, count))) +
  geom_col()
# Count by Age Group
ask_a_manager_2021 %>%
  count(age_group, name = "count") %>%
  ggplot(aes(x = count, y = fct_reorder(age_group, count))) +
  geom_col()
# Sorting out the people who are 18+
# Fix: the previous mutate() call had no target name, so it created a new
# column literally named "factor(age_group, ...)" and left age_group
# itself unchanged. Assign the releveled factor back to age_group.
ask_a_manager_age <- ask_a_manager_2021 %>%
  mutate(age_group = factor(age_group,
                            levels = c("18-24","25-34","35-44","45-54","55-64","65 or over")))
# Only 6 women fall in the under-18 category and there is no male data
# there to compare against, so that category is excluded from the graph.
ask_a_manager_age %>%
  filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64" )) %>%
  group_by(age_group, gender) %>%
  # bars show medians; points overlay the means
  summarise (median_salary_by_age = median (salary),
             mean_salary_by_age = mean(salary)) %>%
  ggplot () +
  geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs(title = "Median Salary by Age Group" , y="Salary" ,x = "Age Group", subtitle = "Points represent mean salary") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  NULL
# Flag respondents whose overall professional experience differs from
# their in-field experience, i.e. people who (likely) changed fields.
ask_a_manager_2021_changed <- ask_a_manager_2021 %>%
  mutate(if_changed = if_else(
    overall_years_of_professional_experience != years_of_experience_in_field,
    "Yes",
    "No"
  ))
# Salary vs total professional experience, coloured by whether the
# respondent changed fields. fct_relevel() takes the levels as bare
# character arguments; the previous `levels =` named argument was
# silently absorbed by `...`.
ask_a_manager_2021_changed %>%
  ggplot(aes(x=fct_relevel(overall_years_of_professional_experience,
                           "1 year or less","2 - 4 years","5-7 years","8 - 10 years",
                           "11 - 20 years","21 - 30 years","31 - 40 years","41 years or more"),
             y=annual_salary, color=if_changed))+
  # scatter plot
  geom_point(alpha = 0.5)+
  # linear smooth line
  geom_smooth(method = "lm")+
  # legend settings
  theme(legend.position = "bottom",
        legend.title = element_blank())+
  labs(title="Whether changing job affects your salary",
       x="Total Years Spent Working",
       y="Salary")+
  theme_bw()+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  NULL
# Median (bars) and mean (points) salary by experience bracket, split by
# whether the respondent changed fields.
g_switch <- ask_a_manager_2021_changed %>%
  group_by(overall_years_of_professional_experience, if_changed) %>%
  summarise (median_salary_over_time = median (salary),
             mean_salary_over_time = mean(salary)) %>%
  ggplot () +
  # fct_relevel() takes levels as bare character arguments (the previous
  # `levels =` named argument was silently absorbed by `...`)
  geom_col(aes(fct_relevel(overall_years_of_professional_experience,
                           "1 year or less","2 - 4 years","5-7 years","8 - 10 years",
                           "11 - 20 years","21 - 30 years","31 - 40 years","41 years or more"),
               median_salary_over_time, fill=if_changed),
           position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = overall_years_of_professional_experience, y=mean_salary_over_time, color = if_changed))+
  theme_bw()+
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  labs( title = "Median Salary over time" , y="Salary" ,x = "Prof", subtitle = "Points represent the mean") +
  NULL
g_switch
# Save explicitly rather than relying on the "last displayed plot".
ggsave("switch_industry.png", plot = g_switch, width = 12, height = 8, limitsize = FALSE)
Observation:
The greater the amount of experience the greater salary difference between those who switch and those who don’t.
The percentage increase in salary with increasing professional experience is proportional.
The outliers for the data were predominantly those who switched industry.
# Summary statistics of USD-converted salary by gender.
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021)
| gender | min | Q1 | median | Q3 | max | mean | sd | n | missing |
|---|---|---|---|---|---|---|---|---|---|
| Man | 1 | 6.3e+04 | 9e+04 | 1.28e+05 | 2.62e+05 | 9.67e+04 | 4.3e+04 | 4303 | 0 |
| Woman | 35 | 5.25e+04 | 7.11e+04 | 9.7e+04 | 2.55e+05 | 7.82e+04 | 3.48e+04 | 19626 | 0 |
# Salary by gender: jittered points over boxplots.
ask_a_manager_2021 %>%
  ggplot( aes(x=gender, y=converted_salary, color=gender)) +
  geom_jitter(color="black", size=0.4, alpha=0.9) +
  geom_boxplot(alpha=0.3) +
  scale_fill_viridis(discrete = TRUE, alpha=0.6) +
  theme_wsj() +
  # (an identical duplicated theme() call was removed here)
  theme(
    legend.position="none",
    plot.title = element_text(size=11)
  ) +
  theme(axis.title=element_text(size=12))+
  ggtitle("Salary Vs. Gender Boxplot with Jitter") +
  # x axis shows gender, not race (copy-paste label fix)
  ylab("Salary") + xlab ("Gender") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 10))+
  # cap the y axis at 300k; points above the cap are dropped from the plot
  scale_y_continuous(labels = scales::comma, limits = c(0, 300000)) +
  NULL
For our following analysis, we will be comparing data from the male gender “Man” with data from the female gender “Woman.” We will filter out the data on those that are non binary.
# Restrict the comparison to "Man" vs "Woman" respondents, then run a
# Welch two-sample t-test of USD-converted salary by gender.
ask_a_manager_2021 <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary")
t.test(converted_salary ~ gender, data = ask_a_manager_2021)
##
## Welch Two Sample t-test
##
## data: converted_salary by gender
## t = 26, df = 5602, p-value <2e-16
## alternative hypothesis: true difference in means between group Man and group Woman is not equal to 0
## 95 percent confidence interval:
## 17124 19871
## sample estimates:
## mean in group Man mean in group Woman
## 96721 78223
Observation: This t-test shows there is a statistically significant difference in salary between men and women.
#mosaic::favstats (annual_salary ~ gender, data=ask_a_manager_2021)
# Numeric gender dummy (1 = Man, 0 = Woman). Use a numeric NA directly:
# the previous code produced the *string* "NA" and then ran as.numeric()
# over the column, which only yields NA via a coercion warning.
ask_a_manager_2021_gender <- ask_a_manager_2021 %>%
  mutate(gender2 = case_when(
    gender == "Man" ~ 1,
    gender == "Woman" ~ 0,
    TRUE ~ NA_real_
  ))
# 95% t-based confidence interval of mean salary for each gender.
mosaic::favstats (converted_salary ~ gender, data=ask_a_manager_2021_gender) %>%
  mutate(t_critical = qt(0.975,n-1),
         sd_mean = sd/sqrt(n),
         margin_of_error = t_critical*sd_mean,
         ci_lower = mean-margin_of_error,
         ci_higher = mean+margin_of_error)
| gender | min | Q1 | median | Q3 | max | mean | sd | n | missing | t_critical | sd_mean | margin_of_error | ci_lower | ci_higher |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Man | 1 | 6.3e+04 | 9e+04 | 1.28e+05 | 2.62e+05 | 9.67e+04 | 4.3e+04 | 4303 | 0 | 1.96 | 655 | 1.28e+03 | 9.54e+04 | 9.8e+04 |
| Woman | 35 | 5.25e+04 | 7.11e+04 | 9.7e+04 | 2.55e+05 | 7.82e+04 | 3.48e+04 | 19626 | 0 | 1.96 | 248 | 487 | 7.77e+04 | 7.87e+04 |
Observation: Since the two 95% confidence intervals do not overlap, we are at least 95% confident that men have a higher salary than women on average.
# hypothesis testing using infer package
# Observed difference in mean salary (Woman - Man).
diff <- ask_a_manager_2021_gender %>%
  specify(converted_salary ~ gender) %>%
  calculate(stat = "diff in means", order = c("Woman", "Man"))
set.seed(1234)
salary_in_null_world <- ask_a_manager_2021_gender %>%
  # redundant here (Non-binary rows were removed above); kept as a guard
  filter(gender !="Non-binary") %>%
  specify(converted_salary ~ gender) %>%
  # assuming independence of gender and salary
  hypothesize(null = "independence") %>%
  # generate 1000 reps, of type "permute"
  generate(reps = 1000, type = "permute") %>%
  # calculate statistic of difference, namely "diff in means"
  calculate(stat = "diff in means", order = c("Woman", "Man"))
# Null distribution with the observed difference shaded (two-sided).
salary_in_null_world %>%
  visualise()+
  shade_p_value(obs_stat = diff, direction = "two-sided")
# Two-sided permutation p-value; 0 means "smaller than 1/reps (0.001)".
p_value <- salary_in_null_world %>%
  get_pvalue(obs_stat = diff, direction="both")
p_value
| p_value |
|---|
| 0 |
We want to look at the relationship between one’s gender and education degree. In essence, we want to explore if there is a difference in the proportion of respondents with high school degree as their highest degree for male group and the female group. We can use the proportion test to generate the result from the male and female sample.
# Count, per gender, respondents below college vs college-or-above based
# on their highest completed education level.
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary") %>%
  mutate(below_college = highest_level_of_education_completed %in%
           c("High School", "Some college")) %>%
  group_by(gender) %>%
  summarise(college_below = sum(below_college),
            college_or_above = sum(!below_college))
ask_a_manager_2021_education
| gender | college_below | college_or_above |
|---|---|---|
| Man | 635 | 3668 |
| Woman | 1579 | 18047 |
# Two-sample proportion test: college-or-above share, men (3668/4303) vs women (18047/19626).
prop.test(x = c(3668,18047), n = c(3668+635,18047+1579), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c3668 out of 3668 + 63518047 out of 18047 + 1579
## X-squared = 189, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0785 -0.0557
## sample estimates:
## prop 1 prop 2
## 0.852 0.920
From the result of the proportion test for the male and female groups, we are at least 95% confident that the proportion with high school or some college as the highest education level is higher for men than for women, indicating that overall women have a higher college-completion rate than men.
If we repeat the process to explore the difference in proportion in acquiring a masters or above degree for both gender:
# Same split per gender, now for Master's-or-above degrees.
ask_a_manager_2021_education <- ask_a_manager_2021 %>%
  filter(gender != "Non-binary") %>%
  mutate(higher_edu = highest_level_of_education_completed %in%
           c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD")) %>%
  group_by(gender) %>%
  summarise(master_below = sum(!higher_edu),
            master_or_above = sum(higher_edu))
ask_a_manager_2021_education
| gender | master_below | master_or_above |
|---|---|---|
| Man | 2811 | 1492 |
| Woman | 11068 | 8558 |
# Two-sample proportion test: Master's-or-above share, men (1492/4303) vs women (8558/19626).
prop.test(x = c(1492, 8558), n = c(1492+2811, 8558+11068), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c1492 out of 1492 + 28118558 out of 8558 + 11068
## X-squared = 115, df = 1, p-value <2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.1053 -0.0734
## sample estimates:
## prop 1 prop 2
## 0.347 0.436
From the result of two proportion tests, we can see that even though the proportion of acquiring a college-or-above degree is less in the male group, the proportion of acquiring higher education degree (Master’s-or-above) is higher than the female group. This somehow explains the fact that there are more outliers in male sample with extremely high income level.
Here we want to see if race is affecting the salary.
# Does being White vs non-White associate with a salary difference?
ask_a_manager_salary <- ask_a_manager_2021 %>%
  mutate(if_white = if_else(race == "White", "Yes", "No")) %>%
  select(if_white,salary)
# Welch two-sample t-test. The previous `success = "Yes"` argument is not
# a t.test() parameter (it belongs to infer/prop-test APIs) and was
# silently swallowed by `...`, so it has been removed.
t.test(salary~if_white, data = ask_a_manager_salary)
##
## Welch Two Sample t-test
##
## data: salary by if_white
## t = 7, df = 4948, p-value = 4e-12
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 3474 6195
## sample estimates:
## mean in group No mean in group Yes
## 85869 81035
We want to look at the relationship between one’s race and education degree. Particularly, we want to see if there is a difference in education level for those being white and those being non-white.
As the above analysis for gender vs. education, we run two proportion tests for college-or-above degree percentage and master-or-above percentage between the male and female group.
# Count, for White vs non-White respondents, below college vs
# college-or-above education levels.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  mutate(if_white = if_else(race == "White", "White", "Non-white"),
         below_college = highest_level_of_education_completed %in%
           c("High School", "Some college")) %>%
  group_by(if_white) %>%
  summarise(college_below = sum(below_college),
            college_or_above = sum(!below_college))
ask_a_manager_2021_race
| if_white | college_below | college_or_above |
|---|---|---|
| Non-white | 334 | 3333 |
| White | 1880 | 18382 |
# Two-sample proportion test: college-or-above share, non-white (3333/3667) vs white (18382/20262).
prop.test(x = c(3333,18382), n = c(3333+334, 18382+1880), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c3333 out of 3333 + 33418382 out of 18382 + 1880
## X-squared = 0.09, df = 1, p-value = 0.8
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.00859 0.01200
## sample estimates:
## prop 1 prop 2
## 0.909 0.907
Similarly, we run the test for the proportion of Master's-or-above education levels between White and non-White respondents:
# Count, for White vs non-White respondents, Master's-or-above degrees.
ask_a_manager_2021_race <- ask_a_manager_2021 %>%
  mutate(if_white = if_else(race == "White", "White", "Non-white"),
         higher_edu = highest_level_of_education_completed %in%
           c("Master's degree", "Professional degree (MD, JD, etc.)", "PhD")) %>%
  group_by(if_white) %>%
  summarise(master_below = sum(!higher_edu),
            master_or_above = sum(higher_edu))
ask_a_manager_2021_race
| if_white | master_below | master_or_above |
|---|---|---|
| Non-white | 2256 | 1411 |
| White | 11623 | 8639 |
# Two-sample proportion test: Master's-or-above share, non-white vs white.
# Fix: the white-group denominator used 8629 instead of the 8639
# master_or_above count shown in the table above.
prop.test(x = c(1411, 8639), n = c(1411+2256, 8639+11623), alternative = "two.sided")
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c out of c1411 out of 1411 + 22568639 out of 8629 + 11623
## X-squared = 22, df = 1, p-value = 3e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.0591 -0.0245
## sample estimates:
## prop 1 prop 2
## 0.385 0.427
Observation: From the result of the proportion test, we cannot see a clear difference between the proportion of education level in white and non-white group, as the p-value is higher than 5%.
However, the proportion of acquiring higher education degree (Master’s-or-above) is higher in the white group vs. other races
# Welch t-test: salary of field-changers ("Yes") vs non-changers ("No").
t.test(salary~if_changed, data = ask_a_manager_2021_changed)
##
## Welch Two Sample t-test
##
## data: salary by if_changed
## t = 19, df = 23826, p-value <2e-16
## alternative hypothesis: true difference in means between group No and group Yes is not equal to 0
## 95 percent confidence interval:
## 7949 9827
## sample estimates:
## mean in group No mean in group Yes
## 85884 76996
# Median (bars) and mean (points) salary by age group and gender.
# NOTE(review): this repeats the earlier "Median Salary by Age Group"
# chunk with slightly different labels -- possibly a duplicate to remove.
ask_a_manager_age %>%
  filter(age_group %in% c("18-24", "25-34" , "35-44" , "45-54" , "55-64" )) %>%
  group_by(age_group, gender) %>%
  summarise (median_salary_by_age = median (salary),
             mean_salary_by_age = mean(salary)) %>%
  ggplot () +
  geom_col(aes (age_group, median_salary_by_age, fill = gender), position = position_dodge(width = 0.9), alpha = 0.7)+
  geom_point(aes(x = age_group, y=mean_salary_by_age, color = gender))+
  theme_bw()+
  #formatC(x, format = "d")+
  labs( title = "Median Salary by Age Group" , y="Salary" ,x = "Age Group") +
  NULL
Observation:
Changing industry has a statistically significant effect on average salary in our respondent sample. Those who changed industry earn less, on average (mean $76,996 vs. $85,884 for those who did not).
# Load US population estimates (USDA ERS) and aggregate them to state level,
# using the latest available year (2019), expressed in millions of people.
population_url <- "https://www.ers.usda.gov/webdocs/DataFiles/48747/PopulationEstimates.csv?v=2232"
population <- vroom(population_url) %>%
  janitor::clean_names() %>%
  # keep only the latest estimate, namely 2019
  select(fips = fip_stxt, state = area_name, pop_estimate_2019) %>%
  group_by(state) %>%
  summarise(pop_estimate_2019_m = sum(pop_estimate_2019) / 1000000)
# Build the regression dataset: US respondents paid in USD, working in the
# 15 largest industries, restricted to states with more than 10 respondents.
reg_data <- ask_a_manager_2021 %>%
  filter(!is.na(state) & industry %in% industry_list$industry[1:15]) %>%
  group_by(state) %>%
  filter(n() > 10) %>%
  ungroup() %>%
  # FIX: this condition was mangled by an encoding artifact (the "&curre"
  # run was rendered as a currency-sign character); restored to the intended
  # country AND currency filter.
  filter(country == "US" & currency == "USD") %>%
  mutate(
    job = tolower(job_title),
    # keyword flags from the job title, used as regression dummies
    research = grepl("research", job),
    manager = grepl("manager", job),
    engineer = grepl("engineer", job),
    analyst = grepl("analyst", job),
    data_related = grepl("data", job),
    scientist = grepl("scientist", job),
    database = grepl("database", job),
    librarian = grepl("librarian", job),
    # whether the respondent supplied free-text context
    add_context_job = !is.na(additional_context_on_job_title),
    add_context_inc = !is.na(additional_context_on_income),
    # collapse race into four buckets (TRUE, not T, as the catch-all)
    race = case_when(
      race == "White" ~ "White",
      race == "Asian or Asian American" ~ "Asian",
      race == "Black or African American" ~ "Black",
      TRUE ~ "Others"
    )
  ) %>%
  # attach 2019 state population in millions
  left_join(population, by = c("state")) %>%
  select(c(
    "age_group", "gender", "education", "field_exp", "pro_exp", "race",
    "salary", "research", "manager", "engineer", "industry", "analyst",
    "data_related", "scientist", "database", "librarian", "add_context_job",
    "add_context_inc", "pop_estimate_2019_m"
  ))
# Reproducible 70/30 train-test split of the regression dataset
set.seed(1234)
data_split <- initial_split(reg_data, prop = 0.7)
data_train <- training(data_split)
data_test <- testing(data_split)
# Three single-predictor baselines for salary: age bracket, overall
# professional experience, and years of experience in the current field.
m1 <- lm(salary ~ age_group, data=data_train)
m2 <- lm(salary ~ pro_exp, data=data_train)
m3 <- lm(salary ~ field_exp, data=data_train)
# Side-by-side regression table for the three baseline models (no stars;
# coefficients significant at the 5% level are bolded instead).
baseline_stats <- c(
  "#observations" = "nobs",
  "R squared" = "r.squared",
  "Adj. R Squared" = "adj.r.squared",
  "Residual SE" = "sigma",
  "AIC" = "AIC"
)
huxreg(m1, m2, m3,
       statistics = baseline_stats,
       bold_signif = 0.05,
       stars = NULL) %>%
  set_caption("Comparison of models (age or experience?)")
| (1) | (2) | (3) | |
|---|---|---|---|
| (Intercept) | 61635.571 | 69671.994 | 65433.717 |
| (1832.014) | (2855.294) | (1625.252) | |
| age_group25-34 | 20407.709 | ||
| (1904.286) | |||
| age_group35-44 | 30716.137 | ||
| (1921.989) | |||
| age_group45-54 | 33106.375 | ||
| (2096.570) | |||
| age_group55-64 | 22772.191 | ||
| (2587.712) | |||
| age_group65 or over | 30382.329 | ||
| (5556.773) | |||
| pro_exp2 - 4 years | 1345.220 | ||
| (3057.459) | |||
| pro_exp5-7 years | 8494.468 | ||
| (2973.095) | |||
| pro_exp8 - 10 years | 16057.951 | ||
| (2961.636) | |||
| pro_exp11 - 20 years | 22547.278 | ||
| (2914.158) | |||
| pro_exp21 - 30 years | 27128.221 | ||
| (3013.121) | |||
| pro_exp31 - 40 years | 21967.309 | ||
| (3475.103) | |||
| pro_exp41 years or more | 18277.526 | ||
| (5948.339) | |||
| field_exp2 - 4 years | 6838.122 | ||
| (1781.163) | |||
| field_exp5-7 years | 17214.794 | ||
| (1763.885) | |||
| field_exp8 - 10 years | 25866.721 | ||
| (1810.061) | |||
| field_exp11 - 20 years | 32427.263 | ||
| (1767.441) | |||
| field_exp21 - 30 years | 40462.832 | ||
| (2095.928) | |||
| field_exp31 - 40 years | 32929.075 | ||
| (3288.613) | |||
| field_exp41 years or more | 41222.125 | ||
| (8428.581) | |||
| #observations | 11366 | 11366 | 11366 |
| R squared | 0.036 | 0.047 | 0.090 |
| Adj. R Squared | 0.036 | 0.046 | 0.089 |
| Residual SE | 37095.450 | 36898.532 | 36049.843 |
| AIC | 271432.355 | 271313.361 | 270784.405 |
# Add more variables to explore and improve accuracy.
# FIX: m9-m11 contained a doubled "+" typo (pop_estimate_2019_m++research),
# which R parses as a unary plus; cleaned up here — the fitted models are
# unchanged. Long formulas are wrapped for readability.
m4 <- lm(salary ~ field_exp + gender, data = data_train)
m5 <- lm(salary ~ field_exp + gender + gender * field_exp, data = data_train)
m6 <- lm(salary ~ field_exp + gender + education, data = data_train)
m7 <- lm(salary ~ field_exp + gender + gender * field_exp + education + race,
         data = data_train)
m8 <- lm(salary ~ field_exp + gender + gender * field_exp + education + race +
           pop_estimate_2019_m, data = data_train)
m9 <- lm(salary ~ field_exp + gender + gender * field_exp + education + race +
           pop_estimate_2019_m + research + manager + engineer + industry +
           data_related + scientist + database + librarian, data = data_train)
m10 <- lm(salary ~ field_exp + gender + gender * field_exp + education + race +
            pop_estimate_2019_m + research + manager + engineer + industry +
            data_related + scientist + database + librarian + add_context_job +
            add_context_inc, data = data_train)
m11 <- lm(salary ~ field_exp + gender + gender * field_exp + education + race +
            pop_estimate_2019_m + research + manager + engineer + industry +
            data_related + scientist + database + librarian + add_context_job +
            add_context_inc + data_related * scientist, data = data_train)
# Regression table comparing models 4-11; coefficients significant at the
# 5% level are bolded, with the default significance stars shown.
comparison_table <- huxreg(
  m4, m5, m6, m7, m8, m9, m10, m11,
  statistics = c(
    "#observations" = "nobs",
    "R squared" = "r.squared",
    "Adj. R Squared" = "adj.r.squared",
    "Residual SE" = "sigma",
    "AIC" = "AIC"
  ),
  bold_signif = 0.05
)
set_caption(comparison_table, "Comparison of models (adding variables)")
| (1) | (2) | (3) | (4) | (5) | (6) | (7) | (8) | |
|---|---|---|---|---|---|---|---|---|
| (Intercept) | 82483.049 *** | 69628.934 *** | 61442.370 *** | 68574.888 *** | 57853.358 *** | 42915.253 *** | 43368.902 *** | 43282.577 *** |
| (1765.664) | (4507.873) | (3166.399) | (5296.351) | (5263.800) | (4598.579) | (4594.684) | (4594.386) | |
| field_exp2 - 4 years | 6196.971 *** | 11580.886 * | 5982.387 *** | 13174.453 ** | 13505.674 ** | 10899.235 ** | 10947.596 ** | 11045.310 ** |
| (1743.832) | (4852.730) | (1703.013) | (4688.376) | (4627.460) | (3968.051) | (3963.290) | (3963.175) | |
| field_exp5-7 years | 16377.048 *** | 27928.293 *** | 16152.069 *** | 29079.987 *** | 29292.137 *** | 26469.305 *** | 26458.740 *** | 26543.246 *** |
| (1727.091) | (4796.247) | (1688.233) | (4633.690) | (4573.462) | (3922.239) | (3917.520) | (3917.328) | |
| field_exp8 - 10 years | 25199.227 *** | 38864.445 *** | 24544.398 *** | 40456.388 *** | 40503.349 *** | 37719.869 *** | 37833.908 *** | 37957.127 *** |
| (1772.137) | (4913.937) | (1733.477) | (4748.284) | (4686.551) | (4021.405) | (4016.630) | (4016.700) | |
| field_exp11 - 20 years | 31141.420 *** | 47860.450 *** | 31052.911 *** | 49988.943 *** | 50798.309 *** | 46225.827 *** | 46395.762 *** | 46503.502 *** |
| (1731.126) | (4768.854) | (1694.291) | (4608.543) | (4548.865) | (3905.641) | (3901.098) | (3901.069) | |
| field_exp21 - 30 years | 37525.013 *** | 60021.389 *** | 38730.943 *** | 64272.212 *** | 64767.293 *** | 59023.009 *** | 59130.650 *** | 59252.556 *** |
| (2055.960) | (5137.828) | (2011.519) | (4966.416) | (4901.929) | (4208.708) | (4203.698) | (4203.712) | |
| field_exp31 - 40 years | 30935.521 *** | 46038.954 *** | 32456.827 *** | 49881.389 *** | 51156.019 *** | 43022.829 *** | 43078.827 *** | 43222.879 *** |
| (3220.494) | (7399.567) | (3146.731) | (7153.386) | (7060.764) | (6079.513) | (6072.245) | (6072.030) | |
| field_exp41 years or more | 35440.467 *** | 43913.566 *** | 37627.381 *** | 50078.983 *** | 48625.778 *** | 41841.212 *** | 41905.812 *** | 42027.137 *** |
| (8254.891) | (13238.881) | (8059.436) | (12791.714) | (12625.683) | (10823.046) | (10810.091) | (10809.049) | |
| genderWoman | -19462.346 *** | -4788.972 | -21123.723 *** | -5613.328 | -5308.446 | 3823.444 | 3746.403 | 3870.918 |
| (874.130) | (4816.323) | (858.022) | (4652.743) | (4592.285) | (3939.171) | (3934.651) | (3934.753) | |
| field_exp2 - 4 years:genderWoman | -5812.696 | -7175.929 | -7008.995 | -5215.950 | -5146.048 | -5235.200 | ||
| (5198.644) | (5022.080) | (4956.796) | (4249.195) | (4244.143) | (4243.921) | |||
| field_exp5-7 years:genderWoman | -13109.252 * | -13194.269 ** | -13241.990 ** | -10388.510 * | -10230.593 * | -10314.224 * | ||
| (5139.849) | (4965.449) | (4900.892) | (4200.445) | (4195.502) | (4195.256) | |||
| field_exp8 - 10 years:genderWoman | -15637.001 ** | -16538.217 ** | -16070.142 ** | -12689.203 ** | -12650.008 ** | -12762.324 ** | ||
| (5267.091) | (5088.890) | (5022.800) | (4306.057) | (4300.910) | (4300.829) | |||
| field_exp11 - 20 years:genderWoman | -19445.181 *** | -19952.011 *** | -20466.473 *** | -13887.439 *** | -13903.459 *** | -14009.758 *** | ||
| (5117.237) | (4945.171) | (4880.967) | (4186.693) | (4181.661) | (4181.561) | |||
| field_exp21 - 30 years:genderWoman | -27971.809 *** | -29147.813 *** | -29545.621 *** | -21768.894 *** | -21679.884 *** | -21795.637 *** | ||
| (5619.375) | (5429.326) | (5358.786) | (4596.536) | (4591.046) | (4590.930) | |||
| field_exp31 - 40 years:genderWoman | -17581.031 * | -17503.351 * | -17950.723 * | -7648.653 | -7433.630 | -7553.913 | ||
| (8228.457) | (7949.878) | (7846.561) | (6748.574) | (6740.577) | (6740.108) | |||
| field_exp41 years or more:genderWoman | -7106.165 | -10096.647 | -11137.299 | 337.307 | -396.326 | -543.267 | ||
| (17053.820) | (16478.297) | (16264.168) | (13939.868) | (13924.525) | (13923.153) | |||
| educationSome college | 6664.759 * | 6636.025 * | 6334.711 * | 5116.750 * | 5004.721 * | 4982.621 * | ||
| (2996.167) | (2959.619) | (2921.192) | (2503.433) | (2500.511) | (2500.253) | |||
| educationCollege degree | 20794.145 *** | 20287.413 *** | 19796.872 *** | 18576.157 *** | 18436.607 *** | 18425.183 *** | ||
| (2747.005) | (2714.021) | (2678.884) | (2303.248) | (2300.621) | (2300.366) | |||
| educationMaster's degree | 23589.736 *** | 23335.704 *** | 23333.386 *** | 28704.406 *** | 28548.266 *** | 28515.267 *** | ||
| (2765.094) | (2731.568) | (2696.054) | (2327.655) | (2325.034) | (2324.834) | |||
| educationProfessional degree (MD, JD, etc.) | 45943.690 *** | 45707.334 *** | 45647.748 *** | 52250.081 *** | 52182.832 *** | 52132.988 *** | ||
| (3081.052) | (3043.512) | (3003.945) | (2692.734) | (2689.748) | (2689.570) | |||
| educationPhD | 36354.897 *** | 36615.006 *** | 36387.719 *** | 45722.324 *** | 45422.768 *** | 45417.812 *** | ||
| (3081.974) | (3044.832) | (3005.274) | (2632.831) | (2630.980) | (2630.682) | |||
| raceBlack | -18415.931 *** | -15219.157 *** | -10056.790 *** | -9991.410 *** | -10010.249 *** | |||
| (2494.993) | (2469.422) | (2120.425) | (2117.969) | (2117.751) | ||||
| raceOthers | -19825.444 *** | -17587.033 *** | -11492.685 *** | -11466.127 *** | -11474.295 *** | |||
| (1862.273) | (1842.573) | (1582.357) | (1580.617) | (1580.442) | ||||
| raceWhite | -23129.046 *** | -18850.259 *** | -11922.140 *** | -11797.448 *** | -11823.976 *** | |||
| (1543.285) | (1543.007) | (1327.160) | (1325.828) | (1325.751) | ||||
| pop_estimate_2019_m | 496.117 *** | 414.710 *** | 413.930 *** | 413.058 *** | ||||
| (28.559) | (24.571) | (24.546) | (24.547) | |||||
| researchTRUE | -1355.889 | -1207.748 | -875.806 | |||||
| (1799.811) | (1797.860) | (1806.191) | ||||||
| managerTRUE | 8292.792 *** | 8229.515 *** | 8232.859 *** | |||||
| (719.191) | (718.516) | (718.436) | ||||||
| engineerTRUE | 22900.445 *** | 22672.079 *** | 22686.156 *** | |||||
| (1090.997) | (1090.502) | (1090.403) | ||||||
| industryBusiness or Consulting | 14160.591 *** | 14381.124 *** | 14444.830 *** | |||||
| (1737.439) | (1735.976) | (1736.104) | ||||||
| industryComputing or Tech | 19739.674 *** | 19520.670 *** | 19519.668 *** | |||||
| (1217.430) | (1217.248) | (1217.110) | ||||||
| industryEducation (Higher Education) | -24655.285 *** | -24421.383 *** | -24394.782 *** | |||||
| (1321.490) | (1321.082) | (1321.006) | ||||||
| industryEducation (Primary/Secondary) | -27573.822 *** | -27838.305 *** | -27799.251 *** | |||||
| (1690.342) | (1690.152) | (1690.085) | ||||||
| industryEngineering or Manufacturing | -3540.355 * | -3336.457 * | -3256.983 * | |||||
| (1490.069) | (1488.809) | (1489.232) | ||||||
| industryGovernment and Public Administration | -6801.635 *** | -6551.443 *** | -6437.312 *** | |||||
| (1402.622) | (1401.703) | (1402.840) | ||||||
| industryHealth care | -4316.280 ** | -4243.986 ** | -4164.012 ** | |||||
| (1354.116) | (1352.599) | (1353.105) | ||||||
| industryInsurance | -3274.070 | -3313.004 | -3312.386 | |||||
| (1986.342) | (1984.418) | (1984.192) | ||||||
| industryLaw | -5477.892 ** | -5400.347 ** | -5376.811 ** | |||||
| (1785.729) | (1783.907) | (1783.747) | ||||||
| industryMarketing, Advertising & PR | 450.697 | 378.725 | 363.498 | |||||
| (1561.097) | (1559.309) | (1559.152) | ||||||
| industryMedia & Digital | -4514.505 * | -4483.965 * | -4457.684 * | |||||
| (1775.735) | (1773.998) | (1773.850) | ||||||
| industryNonprofits | -15804.795 *** | -15631.462 *** | -15611.513 *** | |||||
| (1270.969) | (1269.925) | (1269.824) | ||||||
| industryRecruitment or HR | -2879.708 | -3112.659 | -3128.515 | |||||
| (2063.699) | (2061.669) | (2061.451) | ||||||
| industryRetail | -20691.092 *** | -20791.356 *** | -20777.168 *** | |||||
| (2110.220) | (2108.144) | (2107.917) | ||||||
| data_relatedTRUE | 7365.560 *** | 7170.066 *** | 5791.150 ** | |||||
| (1884.875) | (1883.043) | (2018.888) | ||||||
| scientistTRUE | 12911.077 *** | 12713.710 *** | 9426.006 ** | |||||
| (2605.884) | (2603.049) | (3129.243) | ||||||
| databaseTRUE | -16577.967 * | -16550.496 * | -15181.333 * | |||||
| (6445.897) | (6438.143) | (6477.934) | ||||||
| librarianTRUE | -11263.970 *** | -11402.890 *** | -11437.726 *** | |||||
| (2118.660) | (2116.375) | (2116.214) | ||||||
| add_context_jobTRUE | -2928.628 *** | -2916.277 *** | ||||||
| (623.222) | (623.185) | |||||||
| add_context_incTRUE | 2995.721 ** | 2993.794 ** | ||||||
| (919.047) | (918.943) | |||||||
| data_relatedTRUE:scientistTRUE | 10379.524 | |||||||
| (5484.449) | ||||||||
| #observations | 11366 | 11366 | 11366 | 11366 | 11366 | 11366 | 11366 | 11366 |
| R squared | 0.128 | 0.133 | 0.170 | 0.191 | 0.212 | 0.423 | 0.425 | 0.425 |
| Adj. R Squared | 0.127 | 0.131 | 0.169 | 0.190 | 0.211 | 0.421 | 0.422 | 0.423 |
| Residual SE | 35289.480 | 35207.617 | 34435.944 | 34006.685 | 33564.555 | 28743.282 | 28708.695 | 28705.422 |
| AIC | 270300.812 | 270255.011 | 269749.237 | 269474.075 | 269177.590 | 265673.551 | 265648.174 | 265646.577 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||||||||
# Calculate train/test RMSE and R-squared on the test set for all 11 models.
models <- list(m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11)
# Preallocate instead of growing the vector inside the loop.
rmses <- numeric(length(models))
# Total sum of squares of the test outcome is loop-invariant: hoist it out.
SS.total <- with(data_test, sum((salary - mean(salary))^2))
for (i in seq_along(models)) {
  model <- models[[i]]
  print(paste("model", i))
  preds <- predict(model, data_test)
  rmses[i] <- RMSE(
    pred = preds,
    obs = data_test$salary
  )
  # summary(model)$sigma is the residual standard error, reported here as
  # the training RMSE.
  print(paste("RMSE on train data:", summary(model)$sigma))
  print(paste("RMSE on test data:", rmses[i]))
  SS.regression <- sum((preds - mean(data_test$salary))^2)
  print(paste("R Squared on train data", summary(model)$r.squared))
  # fraction of test-set variation explained by the model
  print(paste("R Squared on test data", SS.regression / SS.total))
}
## [1] "model 1"
## [1] "RMSE on train data: 37095.4502501071"
## [1] "RMSE on test data: 36985.4136059742"
## [1] "R Squared on train data 0.0362110702473326"
## [1] "R Squared on test data 0.0373349296083914"
## [1] "model 2"
## [1] "RMSE on train data: 36898.5318250807"
## [1] "RMSE on test data: 36597.9066711955"
## [1] "R Squared on train data 0.0465841987199754"
## [1] "R Squared on test data 0.0476371751180191"
## [1] "model 3"
## [1] "RMSE on train data: 36049.8427580782"
## [1] "RMSE on test data: 35753.5340710996"
## [1] "R Squared on train data 0.089938121540009"
## [1] "R Squared on test data 0.0926641460655427"
## [1] "model 4"
## [1] "RMSE on train data: 35289.4799778895"
## [1] "RMSE on test data: 34924.6767727815"
## [1] "R Squared on train data 0.128000066311433"
## [1] "R Squared on test data 0.132199529165854"
## [1] "model 5"
## [1] "RMSE on train data: 35207.617443797"
## [1] "RMSE on test data: 34830.0983571456"
## [1] "R Squared on train data 0.132575981039106"
## [1] "R Squared on test data 0.137322501017178"
## [1] "model 6"
## [1] "RMSE on train data: 34435.9443290408"
## [1] "RMSE on test data: 34127.075354642"
## [1] "R Squared on train data 0.170037088957289"
## [1] "R Squared on test data 0.17746586804058"
## [1] "model 7"
## [1] "RMSE on train data: 34006.6853010122"
## [1] "RMSE on test data: 33893.6113771057"
## [1] "R Squared on train data 0.191312823275399"
## [1] "R Squared on test data 0.200353434956173"
## [1] "model 8"
## [1] "RMSE on train data: 33564.5551861195"
## [1] "RMSE on test data: 33554.0911706585"
## [1] "R Squared on train data 0.212273508211821"
## [1] "R Squared on test data 0.22268113081223"
## [1] "model 9"
## [1] "RMSE on train data: 28743.2816284638"
## [1] "RMSE on test data: 28711.9924589569"
## [1] "R Squared on train data 0.423390974628695"
## [1] "R Squared on test data 0.413260520145657"
## [1] "model 10"
## [1] "RMSE on train data: 28708.6953277177"
## [1] "RMSE on test data: 28676.5016877011"
## [1] "R Squared on train data 0.424879416748037"
## [1] "R Squared on test data 0.415327383057971"
## [1] "model 11"
## [1] "RMSE on train data: 28705.4215821016"
## [1] "RMSE on test data: 28649.524717651"
## [1] "R Squared on train data 0.425061378093912"
## [1] "R Squared on test data 0.415699021430563"
# For model 10, the test-set RMSE (28676.5) is close to the training RMSE
# (28708.7), indicating there is no heavy overfitting problem.